import os
import sys
sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '../..'))

from itertools import product

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

from python.tools import (
    clean_folder
)

from python.construct_datasets.functions import (
    get_inflation_forecasts
)

################
## Parameters ##
################

file_name_ind_forecasts = './construct_datasets/input/individual_pgdp.xlsx'
file_name_actuals = './construct_datasets/input/p_first_second_third.xlsx'
output_folder = './construct_datasets/output/individual_dataset'
temp_folder = './construct_datasets/temp'
K = 3 # Number of lags for JK shocks
min_T = 10 # Minimum number of observations for a 
           # forecaster to be included when calculating JK shocks

#######################
## Construct dataset ##
#######################

clean_folder(output_folder)

df_ind = pd.read_excel(file_name_ind_forecasts)
df_actuals = pd.read_excel(file_name_actuals, sheet_name = 'DATA', skiprows = 4)
df = df_ind.copy()

# Create date variable
df['DATE'] = pd.to_datetime(df['YEAR'].map(str) + 'Q' + df['QUARTER'].map(str))

# Construct inflation forecasts
df = get_inflation_forecasts(df, melt_id_vars = ['DATE', 'ID'])

# Add back industry 
df_ind['DATE'] = df_ind['YEAR'].map(str) + ':0' + df_ind['QUARTER'].map(str)
df = pd.merge(df, df_ind.loc[:, ['DATE', 'ID', 'INDUSTRY']], 
              left_on = ['DATE', 'ID'], right_on = ['DATE', 'ID'], how = 'left')

# Data for 1995:Q4 is missing for first release, use second release
df_actuals.loc[df_actuals['Date'] == '1995:Q4', 'First'] = df_actuals.loc[df_actuals['Date'] == '1995:Q4', 'Second']
df_actuals.rename(columns = {'Date': 'DATE',
                             'First': 'Realiz1'}, inplace = True)
df_actuals['DATE'] = df_actuals['DATE'].map(lambda x: x.replace('Q', '0'))

# Merge forecasts and actuals
df = pd.merge(df, df_actuals.loc[:, ['DATE', 'Realiz1']], on = 'DATE', how = 'left')

# Export balanced panel, filling missing dates with NA
# Export balanced panel, filling missing dates with NA
unique_ID = list(df['ID'].unique())
unique_dates = list(pd.Series(df['DATE'].unique()).sort_values().reset_index(drop = True))
df_balanced = pd.DataFrame(list(product(unique_ID, unique_dates)), columns = ['ID', 'DATE'])
df_balanced = df_balanced.merge(df, how = 'left', on = ['ID', 'DATE'])

#########################
## Calculate JK shocks ##
#########################
df_temp = df.copy()
df_consensus = pd.read_csv('{}/consensus_dataset.csv'.format(temp_folder))
df_consensus = df_consensus[['DATE', 'SPFfor_Step2']].copy()
df_consensus.rename(columns = {'SPFfor_Step2': 'SPFfor_Step2_cons'},
                    inplace = True)

# Calculate lags
for ll in range(1, K + 1):
    df_consensus['SPFfor_Step2_cons_LAG{}'.format(ll)] = df_consensus['SPFfor_Step2_cons'].shift(ll)
df_temp = pd.merge(df_temp, df_consensus, on = 'DATE', how = 'left')

# Only calculate JK shocks for forecasters
# that make at least a given number of forecasts
df_num_obs = df_temp.groupby('ID').count()['SPFfor_Step2'].reset_index()
df_num_obs.rename(columns = {'SPFfor_Step2': 'Nfor_Step2'},
               inplace = True)
df_temp = pd.merge(df_temp, df_num_obs, on = 'ID', how = 'left')
mask = df_temp['Nfor_Step2'] >= min_T
df_temp = df_temp.loc[mask, ].copy()

df_temp['JK_shock'] = np.nan
for ID in df_temp['ID'].unique():
    mask = df_temp['ID'] == ID
    mod = smf.ols('SPFfor_Step2 ~ SPFfor_Step2_cons + SPFfor_Step2_cons_LAG1 \
        + SPFfor_Step2_cons_LAG2 + SPFfor_Step2_cons_LAG3', data = df_temp.loc[mask, ]).fit()
    df_temp.loc[mask, 'JK_shock'] = mod.resid

df_temp = df_temp[['ID', 'DATE', 'JK_shock']].copy()

# Merge in JK shocks
df_balanced = pd.merge(df_balanced, df_temp, on = ['ID', 'DATE'], how = 'left')

# Save output to file
df_balanced.to_csv('{}/individual_dataset.csv'.format(output_folder), index = False)